Load libraries and source scripts

source("../references/biostats.R")
source("../references/iLOO.R")
`%!in%` = Negate(`%in%`)

# Add all required libraries that are installed with install.packages() here
list.of.packages <- c("RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "cowplot", "clipr", "janitor", "ggpubr", "forcats", "apeglm", "car", "vsn", "devtools", "grid", "gridGraphics", "Rfast", "dendextend", "RColorBrewer", "scales", "VennDiagram", "colorspace", "edgeR", "devtools", "ggpattern", "eulerr", "Rmisc", "pairwiseAdonis")

# Add all libraries that are installed using BiocManager here
bioconductor.packages <- c("DESeq2", "WGCNA")

# Install BiocManager if needed
if(!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")

# Get names of all required packages that aren't installed
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
new.bioc.packages <- bioconductor.packages[!(bioconductor.packages %in% installed.packages()[, "Package"])]
# Install all new packages
if(length(new.packages)) install.packages(new.packages)
# if(length(new.bioc.packages)) BiocManager::install(new.bioc.packages)

# Load all required libraries
all.packages <- c(list.of.packages, bioconductor.packages)
lapply(all.packages, FUN = function(X) {
  do.call("require", list(X))
})

# # Github packages 
# install_github("pmartinezarbizu/pairwiseAdonis/pairwiseAdonis")
#### Load counts and sample info (generated in the "0-Get-data.Rmd" notebook)
load(file="../data/sample.info")
load(file = "../results/counts")
load(file = "../results/counts.t")
load(file = "../results/counts.ts")
load(file = "../results/counts.annot.tanner")
load( file = "../references/tanner.blast")
load(file = "../references/tanner.blast.GO")

Library summary stats

No. of fragments per sample?

data.frame(rowSums(counts.ts)) %>% 
                  dplyr::rename(read.total = 1) %>% 
                  rownames_to_column(var="sample") %>% #summary() 
dplyr::summarise(mean=round(mean(read.total, na.rm=T)), 
          sd=round(sd(read.total, na.rm=T)), 
          se=round(sd/sqrt(length(sample))),
          median=median(read.total), 
          min=min(read.total), 
          max=max(read.total)) %>% t() %>% as.data.frame() %>% 
  dplyr::mutate(V1=prettyNum(V1, big.mark = ",")) %>% 
  dplyr::rename("fragments.per.sample"="V1") #use this to average across all samples 
##        fragments.per.sample
## mean             25,331,103
## sd                5,316,473
## se                  675,193
## median           24,728,512
## min              14,169,939
## max              45,153,229

Did the no. of fragments per treatment differ?

Boxplots of the # of fragments in each sample per treatment
fragments <- data.frame(rowSums(counts.ts)) %>% 
                  dplyr::rename(read.total = 1) %>% 
                  rownames_to_column(var="sample") %>%
    #mutate(sample=as.numeric(sample)) %>%
  left_join(sample.info, by=c("sample"="sampleID")) %>%
  mutate(treatment=factor(treatment, ordered = T, levels=c("ambient", "moderate_short", "moderate_long", "severe_short", "severe_long")))


ggplotly(
ggplot(data = fragments %>%
         arrange(OA)) +
           geom_bar(aes(x=sample, y=read.total, fill=treatment), stat = "identity") + 
  ggtitle("Total # fragments by sample") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) + theme_minimal()) 
Statistically test whether no. of fragments / treatment differs
hist(log(fragments$read.total))

shapiro.test(log(fragments$read.total))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(fragments$read.total)
## W = 0.94306, p-value = 0.006271
summary(aov(log(read.total) ~ treatment, fragments))
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## treatment    4 0.6594 0.16484   5.726 0.000605 ***
## Residuals   57 1.6409 0.02879                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(log(read.total) ~ treatment, fragments))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(read.total) ~ treatment, data = fragments)
## 
## $treatment
##                                     diff         lwr         upr     p adj
## moderate_short-ambient        0.28938055  0.08986761  0.48889349 0.0012666
## moderate_long-ambient         0.17242633 -0.01891192  0.36376459 0.0963304
## severe_short-ambient          0.01377824 -0.18134926  0.20890575 0.9996400
## severe_long-ambient           0.11003537 -0.07799420  0.29806493 0.4734599
## moderate_long-moderate_short -0.11695422 -0.31276280  0.07885436 0.4526584
## severe_short-moderate_short  -0.27560231 -0.47511525 -0.07608937 0.0023615
## severe_long-moderate_short   -0.17934519 -0.37192189  0.01323152 0.0794315
## severe_short-moderate_long   -0.15864809 -0.34998635  0.03269017 0.1487312
## severe_long-moderate_long    -0.06239097 -0.24648525  0.12170331 0.8739603
## severe_long-severe_short      0.09625712 -0.09177244  0.28428669 0.6035428
anova(lm(log(read.total) ~ OA, fragments))
## Analysis of Variance Table
## 
## Response: log(read.total)
##           Df  Sum Sq  Mean Sq F value    Pr(>F)    
## OA         2 0.51799 0.258996  8.5737 0.0005387 ***
## Residuals 59 1.78228 0.030208                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(log(read.total) ~ duration, fragments))
## Analysis of Variance Table
## 
## Response: log(read.total)
##           Df  Sum Sq  Mean Sq F value Pr(>F)
## duration   1 0.00038 0.000377  0.0111 0.9165
## Residuals 48 1.62910 0.033940
anova(lm(log(read.total) ~ OA*duration, fragments))
## Analysis of Variance Table
## 
## Response: log(read.total)
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## OA           1 0.32117 0.32117 12.6605 0.0008796 ***
## duration     1 0.00045 0.00045  0.0178 0.8944529    
## OA:duration  1 0.14092 0.14092  5.5549 0.0227419 *  
## Residuals   46 1.16694 0.02537                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(fragments, aes(x=OA, y=read.total)) + geom_boxplot() + theme_minimal()

ggplot(fragments, aes(x=duration, y=read.total)) + geom_boxplot() + theme_minimal()

ggplot(fragments, aes(x=treatment, y=read.total)) + geom_boxplot() + theme_minimal()

How many samples per treatment, tank, etc?

fragments %>% group_by(treatment) %>% tally()
## # A tibble: 5 × 2
##   treatment          n
##   <ord>          <int>
## 1 ambient           12
## 2 moderate_short    11
## 3 moderate_long     13
## 4 severe_short      12
## 5 severe_long       14
fragments %>% group_by(OA) %>% tally()
## # A tibble: 3 × 2
##   OA           n
##   <fct>    <int>
## 1 ambient     12
## 2 moderate    24
## 3 severe      26
fragments %>% group_by(duration) %>% tally()
## # A tibble: 3 × 2
##   duration     n
##   <fct>    <int>
## 1 long        27
## 2 short       23
## 3 <NA>        12
fragments %>% group_by(treatment) %>% tally() #%>% arrange(treatment, tank)
## # A tibble: 5 × 2
##   treatment          n
##   <ord>          <int>
## 1 ambient           12
## 2 moderate_short    11
## 3 moderate_long     13
## 4 severe_short      12
## 5 severe_long       14

How many genes total in the filtered count matrix?

prettyNum(ncol(counts.ts),big.mark = ",")
## [1] "11,491"

What’s the average no. of genes per sample, etc.?

data.frame(rowSums(counts.ts != 0)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample") %>% 
  summarise(mean=round(mean(count.total, na.rm=T)), 
            sd=round(sd(count.total, na.rm=T)), 
            se=round(sd/sqrt(length(sample))),
            median=median(count.total, na.rm=T),
            min=min(count.total, na.rm=T), 
            max=max(count.total, na.rm=T)) %>% t() %>% as.data.frame() %>% 
  mutate(V1=prettyNum(V1, big.mark = ",")) %>% 
  dplyr::rename("genes.per.sample"="V1")
##        genes.per.sample
## mean             11,140
## sd                  103
## se                   13
## median           11,173
## min              10,779
## max              11,267

Total no. of genes identified in each sample

#ggplotly(
ggplot(data = data.frame(rowSums(counts.t != 0, na.rm=TRUE)) %>% 
                  dplyr::rename(count.total = 1) %>% 
                  rownames_to_column(var="sample")) +
           geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())#) 

### Prep data for analyses

Merge sample key info to count data, then sort, and generate heat map for initial inspection by treatment

# IMPORTANT NOTE: to use data for ALL genes, use object "counts.t". To remove low-frequency genes, use object "counts.ts" (recommended option)

# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day 
counts.tk <- merge(x=sample.info[,c("sampleID", "OA", "duration", "treatment")], by.x="sampleID", y=counts.ts, by.y="row.names") %>% 
  arrange(treatment)  %>% column_to_rownames(var="sampleID") %>% droplevels()
save(counts.tk, file = "../results/counts.tk")
head(counts.tk[1:24]) #check out results of merge/arrange
##          OA duration treatment gene_22760 gene_22761 gene_22085 gene_22086
## S1  ambient     <NA>   ambient        114          7         98        144
## S10 ambient     <NA>   ambient         20         43        227        304
## S11 ambient     <NA>   ambient        259         27        107        174
## S12 ambient     <NA>   ambient        494         14        128        175
## S2  ambient     <NA>   ambient        239          4        169        233
## S3  ambient     <NA>   ambient        168          1        223        275
##     gene_22090 gene_22092 gene_17987 gene_17988 gene_17989 gene_20323
## S1          98         96        232        194         62         83
## S10        131        165        411        463        146        195
## S11        123        170        308        266         78        112
## S12        115         98        180        238         66         57
## S2         130        158        327        339        116        104
## S3         127        162        441        383        122        324
##     gene_15688 gene_24921 gene_46880 gene_46384 gene_41869 gene_3171 gene_3170
## S1          73         75         28         18         40        53        36
## S10        177         90         48         29         66       148        75
## S11        131        106         34         22         50        62        45
## S12        100        139         30         11         53        49        30
## S2         141         76         25         14         39        96        50
## S3         161         90         21         13         39        95        39
##     gene_3172 gene_3157 gene_4558 gene_4559
## S1         30        43       131       328
## S10       105        43       166       536
## S11        41        86       166       529
## S12        24        44       335       402
## S2         78        34       105       344
## S3         74        57        93       401

Reformat for DESeq, ensure correct sample order for

# NOTE: It is absolutely critical that the **columns of the count matrix** and the **rows of the column data (information about samples)** are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.

all(rownames(counts.tk) == counts.tk[-1:-3] %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE' 
## [1] TRUE
##### If using counts which included multimapped counted fractionally, need to round to the nearest integer for DESeq2. 
#_Not needed for gene counts produced by STAR aligner_

#counts.tk <- counts.tk %>% mutate_if(is.numeric, round)

Generate DESeq dataset object

dds.treatment <- DESeqDataSetFromMatrix(countData = counts.tk[-1:-3] %>% t(),
                              colData = counts.tk[,"treatment", drop=FALSE] ,
                              design = ~ treatment)

Transform data for visualization

The variance stabilizing transformation is recommended prior to visualizing expression data and running PCA’s

vsd.treatment <- varianceStabilizingTransformation(dds.treatment, blind=FALSE)
save(vsd.treatment, file = "../results/vsd.treatment")

# Save variance-stabilization normalized gene counts x sample matrix, annotated for later use 
counts.trans <- assay(vsd.treatment) %>% t() %>% as.matrix() %>% as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(sample.info[,c("sampleID", "treatment")]) %>%
  column_to_rownames(var = "sampleID") %>% 
  dplyr::select(treatment, everything()) %>% 
  mutate(treatment=factor(treatment, ordered = T, 
                          levels=c("ambient","moderate_short","moderate_long","severe_short","severe_long")))
save(counts.trans, file = "../results/counts.trans")

Unsupervised analyses

Heat map with top 10% most variable genes, VSD-transformed counts

# calculate CV for each gene across all samples 
temp <-  assay(vsd.treatment) %>% as.data.frame() %>%
  rownames_to_column(var = "gene") %>%
  rowwise() %>% 
  dplyr::mutate(
     sd = sd(c_across(-gene)),
     mean = mean(c_across(-gene)),
     cv = sd/mean) %>%
  #column_to_rownames("gene") %>%
  arrange(dplyr::desc(cv))

# subset top 10% of most variable genes 
most.variable <- temp %>%
  head(n=round(0.1*nrow(assay(vsd.treatment)))) %>%
  select(gene) %>% unlist() %>% as.vector()

# Generate heat map / cluster those top 10% most variable genes - any pattern in the clustering? 
cluster.most.variable <- pheatmap(t(assay(vsd.treatment))[,most.variable], 
         Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA, scale="column",
                     show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE,
         annotation_colors = list(treatment=
          c(`ambient`= "navyblue", 
            `moderate_short`="darkgreen", `moderate_long`="yellow3",
            `severe_short`="sienna2", `severe_long`="firebrick4")),
         annotation_row=as.data.frame(colData(vsd.treatment)[,c("treatment"), drop=FALSE]),
         main = "gene counts - top 10% most variable\n(VSD-transformed)")

Hierarchical clustering

NOTE: this tree was generated by the pheatmap function in the previous code chunk

# Just plot hierarchical cluster, color-code sample by treatment, or by tank number
cluster.dendo <- as.dendrogram(cluster.most.variable$tree_row) 

##bare bones dendogram
#plot(cluster.dendo)

# #A sub tree - looking at branches
# par(cex = 1)
# plot(cluster.dendo[[1]], horiz = TRUE)

order.dendrogram(cluster.dendo) # to find out the order of the sample labels 
##  [1] 26 55 52  2  6 41 61 10 36 51  7 22 32 33 45 53 57 58 38 59 60 11 20 34  1
## [26]  8 28 40 39 62  9 54 30  4 13 23 21 24 18 27 16 25  5 48 12  3 37 42 35 44
## [51] 50 56 15 46 47 14 31 29 43 49 17 19
# Create a dataframe ordered the same as the dendogram for annotation purposes 
dendo.df <- sample.info[match(cluster.most.variable$tree_row$labels, sample.info$sampleID),c("sampleID", "treatment")]
all(dendo.df$sample == cluster.most.variable$tree_row$labels) # Should == TRUE
## [1] TRUE
# Create color coding vectors 
colorCodes.treat <- c(`ambient`= "navyblue", 
            `moderate_short`="darkgreen", `moderate_long`="yellow3",
            `severe_short`="sienna2", `severe_long`="firebrick4")

# Plot dendogram color-code by treatment
labels(cluster.dendo) <- dendo.df$sampleID[order.dendrogram(cluster.dendo)]
labels_colors(cluster.dendo) <- colorCodes.treat[as.character(dendo.df$treatment[order.dendrogram(cluster.dendo)])]
plot(cluster.dendo, xlab="Sample Number", cex.lab=0.9)
legend("topright", title = "Treatment", bty="n", legend =
         c("ambient","moderate_short","moderate_long","severe_short","severe_long"), 
       fill = c("navyblue","darkgreen","yellow3","sienna2", "firebrick4"))

Perform PCA’s

# To view the the guts of the PlotPCA function in DESeq2, execute this code chunk. It reveals that, by default, it only uses the top 500 most influential genes to plot the PCA! 

#getMethod("plotPCA","DESeqTransform")

PCAs using DESeq2

NOTE: Hover over points to see the sample numbers

# Generate PCA with points color coded by pH Treatment with various number of top genes to use for principal components, selected by highest row variance  

# Top 500 most influential genes
ggplotly(
  plotPCA(vsd.treatment, intgroup="treatment", ntop=500) +
           ggtitle("PCA by Treatment (var-stabilizing transformed)\ntop 500 genes") +
    geom_point(size=2, aes(text=colnames(vsd.treatment))) +
      scale_color_manual(values=c("ambient"= "navyblue", 
            "moderate_short"="darkgreen", "moderate_long"="yellow3",
            "severe_short"="sienna2", "severe_long"="firebrick4")) +
      theme_minimal()+ stat_ellipse() , tooltip = "text")
# Top 2000 most influential genes
ggplotly(
  plotPCA(vsd.treatment, intgroup="treatment", ntop=2000) +
           ggtitle("PCA by Treatment (var-stabilizing transformed)\ntop 2000 genes") +
    geom_point(size=2, aes(text=colnames(vsd.treatment))) +
      scale_color_manual(values=c("ambient"= "navyblue", 
            "moderate_short"="darkgreen", "moderate_long"="yellow3",
            "severe_short"="sienna2", "severe_long"="firebrick4")) +
      theme_minimal()+ stat_ellipse() , tooltip = "text")
# All genes! 
#ggplotly(
  plotPCA(vsd.treatment, intgroup="treatment", ntop=nrow(assay(vsd.treatment))) + 
           ggtitle("PCA by Treatment (var-stabilizing transformed)\nall genes") + 
    geom_point(size=3, aes(text=colnames(vsd.treatment))) + 
      scale_color_manual(values=c("ambient"= "navyblue", 
            "moderate_short"="darkgreen", "moderate_long"="yellow3",
            "severe_short"="sienna2", "severe_long"="firebrick4")) +
    theme_minimal()+ stat_ellipse() #, tooltip = "text")

# PCA.data.treatment <- plotPCA(vsd.treatment, intgroup=c("treatment"), returnData=TRUE)
# save(PCA.data.treatment, file="../results/PCA.data.treatment")

Run PCAs using prcomp

This enables screeplot to ID significant PCs, calculates variance, etc.

#pca.princomp <- prcomp(cov(assay(vsd.pH)), scale=F) #scale=F for variance-covariance matrix
pca.princomp <- prcomp(t(assay(vsd.treatment))) # using the same code that's under the hood of PlotPCA but using all genes 
pca.eigenval(pca.princomp) #The Proporation of Variance = %variance 
## Importance of components:
##                                 PC1         PC2         PC3         PC4
## Variance(eigenvalue)   1278.6133648 417.7152980 393.7549427 200.9273154
## Proportion of Variance    0.2682302   0.0876292   0.0826027   0.0421510
## Cumulative Proportion     0.2682302   0.3558594   0.4384621   0.4806131
## Broken-stick value        4.7123929   3.7123929   3.2123929   2.8790596
##                                PC5         PC6         PC7         PC8
## Variance(eigenvalue)   183.4703175 162.1425339 131.8860721 116.3797974
## Proportion of Variance   0.0384888   0.0340146   0.0276673   0.0244144
## Cumulative Proportion    0.5191018   0.5531164   0.5807838   0.6051982
## Broken-stick value       2.6290596   2.4290596   2.2623929   2.1195357
##                               PC9       PC10       PC11       PC12       PC13
## Variance(eigenvalue)   83.2244056 70.5070223 62.9660854 59.4698005 57.9788822
## Proportion of Variance  0.0174590  0.0147911  0.0132092  0.0124757  0.0121629
## Cumulative Proportion   0.6226572  0.6374483  0.6506574  0.6631331  0.6752961
## Broken-stick value      1.9945357  1.8834246  1.7834246  1.6925155  1.6091822
##                              PC14       PC15       PC16       PC17       PC18
## Variance(eigenvalue)   53.4711597 50.0573103 48.5971212 46.0268886 45.4766016
## Proportion of Variance  0.0112173  0.0105011  0.0101948  0.0096556  0.0095402
## Cumulative Proportion   0.6865133  0.6970145  0.7072093  0.7168649  0.7264051
## Broken-stick value      1.5322591  1.4608306  1.3941639  1.3316639  1.2728404
##                              PC19       PC20       PC21       PC22       PC23
## Variance(eigenvalue)   43.9067697 43.5326922 42.2451034 39.2892742 39.0733730
## Proportion of Variance  0.0092109  0.0091324  0.0088623  0.0082422  0.0081969
## Cumulative Proportion   0.7356159  0.7447483  0.7536106  0.7618528  0.7700497
## Broken-stick value      1.2172848  1.1646532  1.1146532  1.0670342  1.0215796
##                              PC24       PC25       PC26       PC27       PC28
## Variance(eigenvalue)   38.0791959 37.4023795 36.5726346 35.8129517 35.3161075
## Proportion of Variance  0.0079883  0.0078463  0.0076723  0.0075129  0.0074087
## Cumulative Proportion   0.7780380  0.7858843  0.7935566  0.8010695  0.8084782
## Broken-stick value      0.9781014  0.9364347  0.8964347  0.8579732  0.8209361
##                              PC29       PC30       PC31       PC32       PC33
## Variance(eigenvalue)   34.4481880 33.6746437 33.3057780 32.8500197 32.4635482
## Proportion of Variance  0.0072266  0.0070643  0.0069870  0.0068913  0.0068103
## Cumulative Proportion   0.8157048  0.8227692  0.8297561  0.8366475  0.8434577
## Broken-stick value      0.7852218  0.7507391  0.7174058  0.6851477  0.6538977
##                              PC34       PC35       PC36       PC37       PC38
## Variance(eigenvalue)   32.0611927 31.3890215 30.9897222 30.8702842 30.6381446
## Proportion of Variance  0.0067259  0.0065849  0.0065011  0.0064760  0.0064273
## Cumulative Proportion   0.8501836  0.8567685  0.8632695  0.8697456  0.8761729
## Broken-stick value      0.6235947  0.5941829  0.5656115  0.5378337  0.5108067
##                              PC39       PC40       PC41       PC42       PC43
## Variance(eigenvalue)   29.4863033 29.2467869 28.6886558 28.3800150 27.9185756
## Proportion of Variance  0.0061857  0.0061355  0.0060184  0.0059536  0.0058568
## Cumulative Proportion   0.8823586  0.8884941  0.8945124  0.9004661  0.9063229
## Broken-stick value      0.4844909  0.4588498  0.4338498  0.4094596  0.3856501
##                              PC44       PC45       PC46       PC47       PC48
## Variance(eigenvalue)   27.6088934 27.0919523 27.0708401 26.8234106 26.3636479
## Proportion of Variance  0.0057919  0.0056834  0.0056790  0.0056271  0.0055306
## Cumulative Proportion   0.9121147  0.9177981  0.9234771  0.9291042  0.9346348
## Broken-stick value      0.3623943  0.3396670  0.3174448  0.2957056  0.2744290
##                              PC49       PC50       PC51       PC52       PC53
## Variance(eigenvalue)   26.0594552 25.5637854 25.3217066 25.1792380 24.8850941
## Proportion of Variance  0.0054668  0.0053628  0.0053120  0.0052822  0.0052204
## Cumulative Proportion   0.9401016  0.9454644  0.9507765  0.9560586  0.9612791
## Broken-stick value      0.2535957  0.2331875  0.2131875  0.1935797  0.1743489
##                              PC54       PC55       PC56       PC57       PC58
## Variance(eigenvalue)   24.3344616 24.0766732 23.4110228 23.2293435 22.8689086
## Proportion of Variance  0.0051049  0.0050509  0.0049112  0.0048731  0.0047975
## Cumulative Proportion   0.9663840  0.9714349  0.9763461  0.9812192  0.9860167
## Broken-stick value      0.1554810  0.1369625  0.1187807  0.1009235  0.0833797
##                              PC59       PC60       PC61     PC62
## Variance(eigenvalue)   22.6844284 22.2705331 21.7015548 0.000000
## Proportion of Variance  0.0047588  0.0046720  0.0045526 0.000000
## Cumulative Proportion   0.9907754  0.9954474  1.0000000 1.000000
## Broken-stick value      0.0661383  0.0491891  0.0325225 0.016129
pc.percent <- pca.eigenval(pca.princomp)[2,1:6]*100
## Importance of components:
screeplot(pca.princomp, bstick=TRUE) #looks like PC 1-3 are significant 

pc.percent[1:3] %>% sum() #total variance explained by first 3 (significant) axes
## [1] 43.84621

Annotate PCA results

#### Generate dataframe with prcomp results 
tab.expr <- data.frame(sample.id = colnames(assay(vsd.treatment)),
    EV1.all = pca.princomp$x[,1],    # the first eigenvector
    EV2.all = pca.princomp$x[,2],    # the second eigenvector
    EV3.all = pca.princomp$x[,3],    # the third eigenvector
    stringsAsFactors = FALSE)
tab.expr.annot <- left_join(tab.expr, sample.info, by=c("sample.id"="sampleID")) %>% droplevels() #add treatment info to PC results
#pdf(file="../results/PCA-1x2.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot, 
          x="EV1.all", y="EV2.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Global gene expression PC1xPC2") + 
  xlab(paste("PC1 (", round(pc.percent[1], digits = 1), "%)", sep="")) + 
  ylab(paste("PC2 (", round(pc.percent[2], digits = 1), "%)", sep="")) + 
  theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

#dev.off()
#pdf(file="../results/PCA-1x3.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot, 
          x="EV1.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Global gene expression PC1xPC3") + 
  xlab(paste("PC1 (", round(pc.percent[1], digits = 1), "%)", sep="")) + 
  ylab(paste("PC3 (", round(pc.percent[3], digits = 1), "%)", sep="")) + 
  theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

#dev.off()
#pdf(file="../results/PCA-2x3.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot, 
          x="EV2.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Global gene expression PC2xPC3") + 
  xlab(paste("PC2 (", round(pc.percent[2], digits = 1), "%)", sep="")) + 
  ylab(paste("PC3 (", round(pc.percent[3], digits = 1), "%)", sep="")) + 
  theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

#dev.off()

Generate PCA using 10% most variable genes using prcomp

pca.princomp.var <- prcomp(t(assay(vsd.treatment)[most.variable,])) # using the same code that's under the hood of PlotPCA 
pca.eigenval(pca.princomp.var) #The Proporation of Variance = %variance 
## Importance of components:
##                                PC1         PC2         PC3         PC4
## Variance(eigenvalue)   707.2304441 332.2734110 225.2619886 118.4688183
## Proportion of Variance   0.3150528   0.1480192   0.1003484   0.0527748
## Cumulative Proportion    0.3150528   0.4630720   0.5634204   0.6161952
## Broken-stick value       4.7123929   3.7123929   3.2123929   2.8790596
##                               PC5        PC6        PC7        PC8        PC9
## Variance(eigenvalue)   82.0108313 58.9954983 50.0824286 33.4080096 28.3348510
## Proportion of Variance  0.0365337  0.0262810  0.0223104  0.0148824  0.0126224
## Cumulative Proportion   0.6527289  0.6790099  0.7013203  0.7162027  0.7288252
## Broken-stick value      2.6290596  2.4290596  2.2623929  2.1195357  1.9945357
##                              PC10       PC11       PC12       PC13       PC14
## Variance(eigenvalue)   26.2804821 23.4192726 23.2255705 22.1982289 20.1591584
## Proportion of Variance  0.0117073  0.0104327  0.0103464  0.0098887  0.0089804
## Cumulative Proportion   0.7405324  0.7509651  0.7613115  0.7712002  0.7801806
## Broken-stick value      1.8834246  1.7834246  1.6925155  1.6091822  1.5322591
##                              PC15       PC16       PC17       PC18       PC19
## Variance(eigenvalue)   18.8994541 18.7351274 17.6345843 17.3859909 16.4888519
## Proportion of Variance  0.0084192  0.0083460  0.0078558  0.0077450  0.0073454
## Cumulative Proportion   0.7885998  0.7969459  0.8048016  0.8125466  0.8198920
## Broken-stick value      1.4608306  1.3941639  1.3316639  1.2728404  1.2172848
##                              PC20       PC21       PC22       PC23       PC24
## Variance(eigenvalue)   15.7071416 15.4145265 15.0747853 14.2694287 13.9591999
## Proportion of Variance  0.0069971  0.0068668  0.0067154  0.0063567  0.0062185
## Cumulative Proportion   0.8268891  0.8337559  0.8404713  0.8468280  0.8530464
## Broken-stick value      1.1646532  1.1146532  1.0670342  1.0215796  0.9781014
##                              PC25       PC26       PC27       PC28       PC29
## Variance(eigenvalue)   13.4359831 13.1699022 12.9891172 12.5184569 12.1486413
## Proportion of Variance  0.0059854  0.0058669  0.0057863  0.0055766  0.0054119
## Cumulative Proportion   0.8590318  0.8648986  0.8706850  0.8762616  0.8816735
## Broken-stick value      0.9364347  0.8964347  0.8579732  0.8209361  0.7852218
##                              PC30       PC31       PC32       PC33       PC34
## Variance(eigenvalue)   11.8194865 11.2711133 11.1031844 10.9081864 10.7906023
## Proportion of Variance  0.0052653  0.0050210  0.0049462  0.0048593  0.0048069
## Cumulative Proportion   0.8869388  0.8919598  0.8969060  0.9017653  0.9065722
## Broken-stick value      0.7507391  0.7174058  0.6851477  0.6538977  0.6235947
##                              PC35       PC36      PC37      PC38      PC39
## Variance(eigenvalue)   10.4054008 10.2423509 9.8773929 9.6527840 9.5140338
## Proportion of Variance  0.0046353  0.0045627 0.0044001 0.0043001 0.0042383
## Cumulative Proportion   0.9112075  0.9157702 0.9201704 0.9244704 0.9287087
## Broken-stick value      0.5941829  0.5656115 0.5378337 0.5108067 0.4844909
##                             PC40      PC41      PC42      PC43      PC44
## Variance(eigenvalue)   9.2568041 9.0674484 8.8981152 8.4811703 8.4329872
## Proportion of Variance 0.0041237 0.0040393 0.0039639 0.0037781 0.0037567
## Cumulative Proportion  0.9328324 0.9368717 0.9408356 0.9446137 0.9483704
## Broken-stick value     0.4588498 0.4338498 0.4094596 0.3856501 0.3623943
##                             PC45      PC46      PC47      PC48      PC49
## Variance(eigenvalue)   8.2720775 7.9545000 7.8875604 7.8274094 7.6718839
## Proportion of Variance 0.0036850 0.0035435 0.0035137 0.0034869 0.0034176
## Cumulative Proportion  0.9520554 0.9555989 0.9591126 0.9625995 0.9660171
## Broken-stick value     0.3396670 0.3174448 0.2957056 0.2744290 0.2535957
##                             PC50      PC51      PC52      PC53      PC54
## Variance(eigenvalue)   7.2871059 7.1603395 6.9113492 6.8289192 6.6452737
## Proportion of Variance 0.0032462 0.0031897 0.0030788 0.0030421 0.0029603
## Cumulative Proportion  0.9692633 0.9724531 0.9755319 0.9785740 0.9815343
## Broken-stick value     0.2331875 0.2131875 0.1935797 0.1743489 0.1554810
##                             PC55      PC56      PC57      PC58      PC59
## Variance(eigenvalue)   6.5279829 6.3965981 6.1602060 6.0503503 5.7049364
## Proportion of Variance 0.0029080 0.0028495 0.0027442 0.0026953 0.0025414
## Cumulative Proportion  0.9844424 0.9872919 0.9900361 0.9927314 0.9952728
## Broken-stick value     0.1369625 0.1187807 0.1009235 0.0833797 0.0661383
##                             PC60      PC61     PC62
## Variance(eigenvalue)   5.4602775 5.1513878 0.000000
## Proportion of Variance 0.0024324 0.0022948 0.000000
## Cumulative Proportion  0.9977052 1.0000000 1.000000
## Broken-stick value     0.0491891 0.0325225 0.016129
pc.percent.var <- pca.eigenval(pca.princomp.var)[2,1:5]*100
## Importance of components:
pc.percent.var[1:3] %>% sum()
## [1] 56.34204
screeplot(pca.princomp.var, bstick=TRUE) #looks like PC 1-3 are significant 

tab.expr.var <- data.frame(sample.id = colnames(assay(vsd.treatment[most.variable,])),
    EV1.all = pca.princomp.var$x[,1],    # the first eigenvector
    EV2.all = pca.princomp.var$x[,2],    # the second eigenvector
    EV3.all = pca.princomp.var$x[,3],    # the third eigenvector
    stringsAsFactors = FALSE)
#shapiro.test(pca.princomp$x) #sample size too large for shapiro test which is weird 
#hist(pca.princomp$x) #normal? hard to say, maybe
tab.expr.annot.var <- left_join(tab.expr.var, sample.info, by=c("sample.id"="sampleID")) %>% droplevels()

# GGSCATTER with ellipses and stars - PC1 PC2
ggscatter(tab.expr.annot.var, 
          x="EV1.all", y="EV2.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Gene Expression PC1xPC2\n10% most variable genes") + 
  xlab(paste("PC1 (", round(pc.percent.var[1], digits = 1), "%)", sep="")) + 
  ylab(paste("PC2 (", round(pc.percent.var[2], digits = 1), "%)", sep="")) + 
  labs(color="OA Treatment") + 
  theme(legend.position = "right") + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

# GGSCATTER with ellipses and stars - PC1 PC3
ggscatter(tab.expr.annot.var, 
          x="EV1.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Gene Expression PC1xPC3\n10% most variable genes") + 
  xlab(paste("PC1 (", round(pc.percent.var[1], digits = 1), "%)", sep="")) + 
  ylab(paste("PC3 (", round(pc.percent.var[3], digits = 1), "%)", sep="")) + 
  labs(color="OA Treatment") + 
  theme(legend.position = "right") + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

# GGSCATTER with ellipses and stars - PC2 PC3
ggscatter(tab.expr.annot.var, 
          x="EV2.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85, 
          ellipse = FALSE, star.plot = FALSE) +
  theme_minimal() + ggtitle("Gene Expression PC2xPC3\n10% most variable genes") + 
  xlab(paste("PC2 (", round(pc.percent.var[2], digits = 1), "%)", sep="")) + 
  ylab(paste("PC3 (", round(pc.percent.var[3], digits = 1), "%)", sep="")) + 
  labs(color="OA Treatment") + 
  theme(legend.position = "right") + 
  scale_color_manual(name="Treatment",
                     values=c("ambient"= "navyblue",
                              "moderate_short"="darkgreen",
                              "moderate_long"="yellow3",
                              "severe_short"="sienna2",
                              "severe_long"="firebrick4"),
                     labels=c("ambient"= "Control",
                              "moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure")) +
  scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) + 
  guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))

perMANOVA - multivariate analysis of variance

# Effect of treatment
vsd.t <- t(assay(vsd.treatment)) %>% as.data.frame()
perm <- adonis(vsd.t ~ treatment, data=sample.info %>% remove_rownames() %>% column_to_rownames("sampleID"), permutations = 1000, method="bray")

# Pairwise comparisons
vsd.t <- sample.info %>% dplyr::select(sampleID, OA, duration, treatment) %>% left_join(t(assay(vsd.treatment)) %>% as.data.frame() %>% rownames_to_column("sampleID"))

# Clustering by OA not considering exposure time? 
pairwise.adonis(vsd.t[,-1:-4], vsd.t$OA, perm = 1000)
##                 pairs Df   SumsOfSqs  F.Model         R2     p.value p.adjusted
## 1   ambient vs severe  1 0.001556803 1.843721 0.04871933 0.049950050 0.14985015
## 2 ambient vs moderate  1 0.002722418 3.312499 0.08877720 0.003996004 0.01198801
## 3  severe vs moderate  1 0.001275948 1.580725 0.03188185 0.066933067 0.20079920
##   sig
## 1    
## 2   .
## 3
# Ambient (i.e. control) vs. moderate appear to differ in multivariate space (not considering exposure time) 

# Clustering by OA considering exposure time? 
pairwise.adonis(vsd.t[,-1:-4], vsd.t$treatment, perm = 1000)
##                              pairs Df   SumsOfSqs  F.Model         R2
## 1           ambient vs severe_long  1 0.001807829 2.180610 0.08329104
## 2          ambient vs severe_short  1 0.001143298 1.322092 0.05668840
## 3         ambient vs moderate_long  1 0.003029834 4.027043 0.14900052
## 4        ambient vs moderate_short  1 0.001834446 2.051760 0.08900664
## 5      severe_long vs severe_short  1 0.001273684 1.581716 0.06182994
## 6     severe_long vs moderate_long  1 0.001115312 1.583724 0.05957493
## 7    severe_long vs moderate_short  1 0.001173246 1.414508 0.05793720
## 8    severe_short vs moderate_long  1 0.002112709 2.903888 0.11210241
## 9   severe_short vs moderate_short  1 0.001099456 1.268272 0.05695422
## 10 moderate_long vs moderate_short  1 0.001660921 2.216647 0.09153402
##        p.value p.adjusted sig
## 1  0.043956044 0.43956044    
## 2  0.181818182 1.00000000    
## 3  0.000999001 0.00999001   *
## 4  0.043956044 0.43956044    
## 5  0.087912088 0.87912088    
## 6  0.091908092 0.91908092    
## 7  0.108891109 1.00000000    
## 8  0.002997003 0.02997003   .
## 9  0.183816184 1.00000000    
## 10 0.007992008 0.07992008
# Yes, ambient vs moderate_long differ, and severe_short vs moderate_long

Differential Expression Analysis

This is the core DESeq2 analysis!

# these are the treatment "levels" that will be used to run pairwise contrasts 
levels(dds.DESeq.treatment$treatment)
## [1] "ambient"        "moderate_long"  "moderate_short" "severe_long"   
## [5] "severe_short"

Identify differentially expressed genes among contrasts

print("Comparison: Ambient pH vs. moderate-short")
## [1] "Comparison: Ambient pH vs. moderate-short"
summary(res.mod.short <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "moderate_short"), alpha=0.05))
## 
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 533, 4.6%
## LFC < 0 (down)     : 529, 4.6%
## outliers [1]       : 0, 0%
## low counts [2]     : 1337, 12%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:",  sum(res.mod.short$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 1062"
print("Comparison: Ambient pH vs. moderate-long")
## [1] "Comparison: Ambient pH vs. moderate-long"
summary(res.mod.long <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "moderate_long"), alpha=0.05))
## 
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1072, 9.3%
## LFC < 0 (down)     : 917, 8%
## outliers [1]       : 0, 0%
## low counts [2]     : 1337, 12%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:",  sum(res.mod.long$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 1989"
print("Comparison: Ambient pH vs. severe-short")
## [1] "Comparison: Ambient pH vs. severe-short"
summary(res.sev.short <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "severe_short"), alpha=0.05))
## 
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 20, 0.17%
## LFC < 0 (down)     : 27, 0.23%
## outliers [1]       : 0, 0%
## low counts [2]     : 7575, 66%
## (mean count < 319)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:",  sum(res.sev.short$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 47"
print("Comparison: Ambient pH vs. severe-long")
## [1] "Comparison: Ambient pH vs. severe-long"
summary(res.sev.long <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "severe_long"), alpha=0.05))
## 
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 287, 2.5%
## LFC < 0 (down)     : 239, 2.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 2451, 21%
## (mean count < 16)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:",  sum(res.sev.long$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 526"

Total number of genes analyzed

nrow(res.sev.long)
## [1] 11491
#### Save differential expression results to objects 
save(res.mod.short, file = "../results/deseq2/res.mod.short")
save(res.mod.long, file = "../results/deseq2/res.mod.long")
save(res.sev.short, file = "../results/deseq2/res.sev.short")
save(res.sev.long, file = "../results/deseq2/res.sev.long")
#### Subset the DESeq results for only those statistically sign. ones
# Could also filter to only include those with abs(L2FC)>0.5, but that might filter out interesting genes - 
diffex.mod.short <- subset(res.mod.short, padj < 0.05) # & abs(log2FoldChange)>0.5
diffex.mod.long <- subset(res.mod.long, padj < 0.05)
diffex.sev.short <- subset(res.sev.short, padj < 0.05)
diffex.sev.long <- subset(res.sev.long, padj < 0.05)

Examine potential influence of outliers

Inspect Cook’s distances for all genes.

par(mar=c(8,5,2,2))
boxplot(log10(assays(dds.DESeq.treatment)[["cooks"]]), range=0, las=2)

Identify outliers using iLOO

DESeq2 has integrated outlier detection. But, after inspecting DEG’s, there do appear to be a few influenced by outlier samples. So, I also perform outlier detection using the iterative Leave-One-Out method (iLOO) and script from George et al. 2015, see iLOO.R script

Note: The function iLOO requires an input matrix, which represents a matrix of read counts for a treatment or homogeneous group containing samples and features. iLOO returns a matrix, where only the outlier read counts are present (all non-outlier values have been NA’ed).

#sample.info$treatment %>% levels()

# Create vectors of sample IDs per treatment
samples.ambient <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="ambient"))$sample
samples.moderate_long <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="moderate_long"))$sample
samples.moderate_short <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="moderate_short"))$sample
samples.severe_long <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="severe_long"))$sample
samples.severe_short <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="severe_short"))$sample

# Create vector of all DEGs identified among any contrast 
degs <- unique(c(rownames(diffex.mod.short),rownames(diffex.mod.long),rownames(diffex.sev.short), rownames(diffex.sev.long)))
length(degs)
## [1] 2724
# Extract gene counts that have outliers replaced with trimmed means, and filter for DEGs
degs.counts <- assays(dds.DESeq.treatment)[["replaceCounts"]] %>% as.data.frame() %>% rownames_to_column("gene") %>% filter(gene %in% degs)

# Extract counts for 
counts.ambient <- degs.counts %>% select(gene, all_of(samples.ambient)) %>% column_to_rownames("gene")
counts.moderate_long <- degs.counts %>% select(gene, all_of(samples.moderate_long)) %>% column_to_rownames("gene")
counts.moderate_short <- degs.counts %>% select(gene, all_of(samples.moderate_short)) %>% column_to_rownames("gene")
counts.severe_long <- degs.counts %>% select(gene, all_of(samples.severe_long)) %>% column_to_rownames("gene")
counts.severe_short <- degs.counts %>% select(gene, all_of(samples.severe_short)) %>% column_to_rownames("gene")

# Run the iterative Leave-One-Out function on gene counts for each treatment 
iLOO.ambient <- iLOO(counts.ambient)
iLOO.moderate_long <- iLOO(counts.moderate_long)
iLOO.moderate_short <- iLOO(counts.moderate_short)
iLOO.severe_long <- iLOO(counts.severe_long)
iLOO.severe_short <- iLOO(counts.severe_short)

# find samples with lots of outliers in DEG lists 
iLOO.ambient %>% as.data.frame() %>% rownames_to_column("gene") %>% 
  pivot_longer(-gene) %>% filter(!is.na(value)) %>%
  mutate(name=as.factor(name)) %>%
  filter(gene %in% c(rownames(diffex.mod.short), rownames(diffex.mod.long),
                     rownames(diffex.sev.short), rownames(diffex.sev.long))) %>% 
  group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 11 × 2
##    name      n
##    <fct> <int>
##  1 S12      14
##  2 S4       12
##  3 S11       6
##  4 S6        6
##  5 S3        3
##  6 S5        2
##  7 S7        2
##  8 S1        1
##  9 S10       1
## 10 S2        1
## 11 S8        1
iLOO.moderate_long %>% as.data.frame() %>% rownames_to_column("gene") %>% 
  pivot_longer(-gene) %>% filter(!is.na(value)) %>%
  mutate(name=as.factor(name)) %>%
  filter(gene %in% c(rownames(diffex.mod.long))) %>% 
  group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 9 × 2
##   name      n
##   <fct> <int>
## 1 S26      40
## 2 S27      40
## 3 S17      13
## 4 S24      12
## 5 S21       3
## 6 S22       3
## 7 S14       1
## 8 S15       1
## 9 S19       1
iLOO.moderate_short %>% as.data.frame() %>% rownames_to_column("gene") %>% 
  pivot_longer(-gene) %>% filter(!is.na(value)) %>%
  mutate(name=as.factor(name)) %>%
  filter(gene %in% c(rownames(diffex.mod.short))) %>%
  group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 8 × 2
##   name      n
##   <fct> <int>
## 1 S32      43
## 2 S38       9
## 3 S31       4
## 4 S28       2
## 5 S30       2
## 6 S36       2
## 7 S35       1
## 8 S37       1
iLOO.severe_short %>% as.data.frame() %>% rownames_to_column("gene") %>% 
  pivot_longer(-gene) %>% filter(!is.na(value)) %>%
  mutate(name=as.factor(name)) %>%
  filter(gene %in% c(rownames(diffex.sev.short))) %>%
  group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 0 × 2
## # … with 2 variables: name <fct>, n <int>
iLOO.severe_long %>% as.data.frame() %>% rownames_to_column("gene") %>% 
  pivot_longer(-gene) %>% filter(!is.na(value)) %>%
  mutate(name=as.factor(name)) %>%
  filter(gene %in% c(rownames(diffex.sev.long))) %>% 
  group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 4 × 2
##   name      n
##   <fct> <int>
## 1 S49       8
## 2 S44       6
## 3 S9        5
## 4 S43       4

Generate lists of degs that have an outlier in at least one sample

# Ambient vs. moderate-short
outs.diffex.mod.short <- full_join(
  iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.mod.short)),

iLOO.moderate_short[rowSums(is.na(iLOO.moderate_short)) != ncol(iLOO.moderate_short), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.mod.short)),
by="gene")

# Ambient vs. moderate-long
outs.diffex.mod.long <- full_join(
  iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.mod.long)),

iLOO.moderate_long[rowSums(is.na(iLOO.moderate_long)) != ncol(iLOO.moderate_long), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.mod.long)),
by="gene")

# Ambient vs. severe-short
outs.diffex.sev.short <- full_join(
  iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.sev.short)),

iLOO.severe_short[rowSums(is.na(iLOO.severe_short)) != ncol(iLOO.severe_short), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.sev.short)),
by="gene")

# Ambient vs. secere-short
outs.diffex.sev.long <- full_join(
  iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.sev.long)),

diffex.sev.long[rowSums(is.na(diffex.sev.long)) != ncol(diffex.sev.long), ] %>% 
  as.data.frame() %>% rownames_to_column("gene") %>%
  filter(gene %in% rownames(diffex.sev.long)),
by="gene")

How many genes are going to be removed due to outlier effects?

c(outs.diffex.mod.short$gene, 
  outs.diffex.mod.short$gene,
  outs.diffex.sev.short$gene,
  outs.diffex.sev.long$gene) %>% unique() %>% length()
## [1] 592

Generate final list of DEGs that will be used for subsequent analyses & results!

Remove genes from DEG lists that contain iLOO-identified outliers from DEG objects, and save to file.

(diffex.mod.short.filt <- subset(diffex.mod.short, rownames(diffex.mod.short) %!in% c(outs.diffex.mod.short$gene)))
## log2 fold change (MLE): treatment ambient vs moderate_short 
## Wald test p-value: treatment ambient vs moderate_short 
## DataFrame with 990 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue      padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric> <numeric>
## gene_22086  163.4332       0.457994  0.141182   3.24400 0.001178650 0.0240594
## gene_17987  265.4243       0.833411  0.229862   3.62571 0.000288172 0.0134225
## gene_15688  114.7739       0.722213  0.208538   3.46321 0.000533762 0.0168004
## gene_24921  120.4863      -0.488948  0.160532  -3.04580 0.002320594 0.0331624
## gene_3171    67.1868       0.566857  0.167939   3.37538 0.000737148 0.0197493
## ...              ...            ...       ...       ...         ...       ...
## gene_3243    7.58741        1.18820  0.387205   3.06867 0.002150175 0.0316878
## gene_14299   7.73821       -1.77428  0.610481  -2.90637 0.003656498 0.0419260
## gene_2281    7.52911        1.24472  0.407544   3.05420 0.002256602 0.0325563
## gene_17639   7.45038       -2.47603  0.743645  -3.32958 0.000869762 0.0209776
## gene_40840   8.76357        1.01936  0.357289   2.85303 0.004330387 0.0452227
(diffex.mod.long.filt <- subset(diffex.mod.long, rownames(diffex.mod.long) %!in% c(outs.diffex.mod.long$gene)))
## log2 fold change (MLE): treatment ambient vs moderate_long 
## Wald test p-value: treatment ambient vs moderate_long 
## DataFrame with 1854 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## gene_24921  120.4863      -0.667364 0.1539545  -4.33481 1.45885e-05 0.000627676
## gene_41869   56.4835      -0.503796 0.1476097  -3.41303 6.42447e-04 0.008144084
## gene_21363  668.8843      -0.227074 0.0833102  -2.72564 6.41773e-03 0.037799117
## gene_23332   12.8703       0.999718 0.3279471   3.04841 2.30054e-03 0.018799339
## gene_23631  162.9875      -0.643769 0.2265273  -2.84191 4.48446e-03 0.029510837
## ...              ...            ...       ...       ...         ...         ...
## gene_38607   8.70204       0.989363  0.325078   3.04346 0.002338737  0.01902847
## gene_991     9.12787       1.563670  0.491829   3.17929 0.001476350  0.01395797
## gene_23381   8.75754      -1.014250  0.363173  -2.79275 0.005226275  0.03277801
## gene_31445   7.50957       1.053183  0.360920   2.91805 0.003522311  0.02483719
## gene_45082   9.34256      -2.147658  0.579019  -3.70913 0.000207971  0.00384174
(diffex.sev.short.filt <- subset(diffex.sev.short, rownames(diffex.sev.short) %!in% c(diffex.sev.short$gene)))
## log2 fold change (MLE): treatment ambient vs severe_short 
## Wald test p-value: treatment ambient vs severe_short 
## DataFrame with 47 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## gene_9733    919.971       0.388296 0.1069915   3.62922 2.84282e-04 0.043647033
## gene_17507   364.966       0.910460 0.1742690   5.22445 1.74671e-07 0.000342006
## gene_20451   794.996      -0.351901 0.0955198  -3.68406 2.29546e-04 0.040859221
## gene_25779  1048.888       1.408023 0.3499245   4.02379 5.72687e-05 0.021768821
## gene_30442   581.969      -0.500407 0.1449462  -3.45236 5.55703e-04 0.048565551
## ...              ...            ...       ...       ...         ...         ...
## gene_14493 46116.276      -0.406327  0.114958  -3.53459 0.000408415   0.0436470
## gene_22162  3319.819       0.484070  0.140261   3.45121 0.000558082   0.0485656
## gene_28677  1146.876       0.688146  0.193193   3.56196 0.000368090   0.0436470
## gene_18867 10148.797      -0.393208  0.109324  -3.59671 0.000322267   0.0436470
## gene_5901    857.875      -0.587951  0.167998  -3.49975 0.000465697   0.0455917
(diffex.sev.long.filt <- subset(diffex.sev.long, rownames(diffex.sev.long) %!in% c(diffex.sev.long$gene)))
## log2 fold change (MLE): treatment ambient vs severe_long 
## Wald test p-value: treatment ambient vs severe_long 
## DataFrame with 526 rows and 6 columns
##             baseMean log2FoldChange     lfcSE      stat      pvalue       padj
##            <numeric>      <numeric> <numeric> <numeric>   <numeric>  <numeric>
## gene_38613  282.1055       0.362134  0.120267   3.01108 2.60324e-03 0.04735073
## gene_12130   48.2706       0.944692  0.237722   3.97393 7.06971e-05 0.00879515
## gene_19035  452.3958      -0.784800  0.168862  -4.64760 3.35822e-06 0.00131993
## gene_21303  179.8877      -0.406202  0.135245  -3.00346 2.66924e-03 0.04806765
## gene_28091  351.8465      -0.326202  0.108757  -2.99935 2.70557e-03 0.04839945
## ...              ...            ...       ...       ...         ...        ...
## gene_33802  186.7508       0.716785  0.230807   3.10556 1.89918e-03 0.04090794
## gene_20756   61.1330       1.023421  0.291222   3.51423 4.41025e-04 0.01981272
## gene_45528  432.3796       1.956336  0.491196   3.98280 6.81071e-05 0.00879515
## gene_11617  362.7768       0.476949  0.140959   3.38359 7.15442e-04 0.02453651
## gene_39972   24.4192       1.726614  0.523343   3.29920 9.69606e-04 0.02921745
save(diffex.mod.short.filt, file = "../results/deseq2/diffex.mod.short.filt")
save(diffex.mod.long.filt, file = "../results/deseq2/diffex.mod.long.filt")
save(diffex.sev.short.filt, file = "../results/deseq2/diffex.sev.short.filt")
save(diffex.sev.long.filt, file = "../results/deseq2/diffex.sev.long.filt")

Examine top differentially expressed genes in each treatment

Plot counts NOTE: using the DeSeq2 function plotCounts plots original counts, i.e. those that are normalized, but outlier values are NOT replaced with trimmed means. I’m really interested to see how the counts look that DESeq2 is analyzing, i.e. I want to see the outlier-replaced counts. So, I must include the argument replaced=TRUE in the plotCounts function.

It’s good to look at the gene expression value - if counts are low (e.g. less than 50 or so) that gene isn’t as interesting as one with mucn larger counts!

Top 15 annotated DEGs, MODERATE OA (pH 7.8) SHORT Exposure (green)

## # A tibble: 15 × 5
##    gene_id       padj spid     evalue protein_names                             
##    <chr>        <dbl> <chr>     <dbl> <chr>                                     
##  1 gene_9105  0.00156 Q08603 7.12e-29 Geranylgeranyl transferase type-2 subunit…
##  2 gene_31640 0.00156 P62083 6.7 e-23 Small ribosomal subunit protein eS7 (40S …
##  3 gene_26230 0.00156 Q9XYN1 7.23e-27 Innexin inx2 (Innexin-2) (G-Inx2)         
##  4 gene_1280  0.00212 Q962X9 2.69e-25 Protein BUD31 homolog (Protein G10 homolo…
##  5 gene_21431 0.00283 A7SMW7 5.36e-18 L-2-hydroxyglutarate dehydrogenase, mitoc…
##  6 gene_27663 0.00295 Q8C428 9.39e-18 Transmembrane channel-like protein 7      
##  7 gene_19111 0.00295 Q9UL12 6.96e-66 Sarcosine dehydrogenase, mitochondrial (S…
##  8 gene_19034 0.00406 Q9ULK2 1.99e-13 Ataxin-7-like protein 1 (Ataxin-7-like pr…
##  9 gene_3111  0.00406 P21328 1.22e-13 RNA-directed DNA polymerase from mobile e…
## 10 gene_3494  0.00406 Q5ZM14 1.28e-16 Na(+)/H(+) exchange regulatory cofactor N…
## 11 gene_38520 0.00406 Q642P2 1.81e-18 Protein dopey-2                           
## 12 gene_29559 0.00433 Q8R420 3.03e-35 Phospholipid-transporting ATPase ABCA3 (E…
## 13 gene_39860 0.00433 Q9MAQ3 5.95e-20 Putative ubiquitin carboxyl-terminal hydr…
## 14 gene_31413 0.00468 P0CB99 1.87e-11 NADH dehydrogenase [ubiquinone] 1 alpha s…
## 15 gene_12849 0.00477 Q94793 6.66e-24 Large ribosomal subunit protein uL5 (60S …

Top 15 annotated DEGs, MODERATE OA (pH 7.8) LONG Exposure (yellow)

## # A tibble: 15 × 5
##    gene_id        padj spid      evalue protein_names                           
##    <chr>         <dbl> <chr>      <dbl> <chr>                                   
##  1 gene_30576 2.62e-27 Q4R6M5 2.49e- 20 Probable ATP-dependent RNA helicase DDX…
##  2 gene_8881  6.75e-16 P30414 7.58e- 15 NK-tumor recognition protein (NK-TR pro…
##  3 gene_13021 1.98e-14 Q5RC80 4.07e- 31 RNA-binding protein 39 (RNA-binding mot…
##  4 gene_11011 2.98e-13 Q52KI8 1.87e- 13 Serine/arginine repetitive matrix prote…
##  5 gene_28510 2.98e-13 Q02040 4.41e- 15 A-kinase anchor protein 17A (AKAP-17A) …
##  6 gene_44742 6.53e-12 Q92734 4.8 e- 13 Protein TFG (TRK-fused gene protein)    
##  7 gene_14632 4.51e-10 Q9UKX7 4.87e- 24 Nuclear pore complex protein Nup50 (50 …
##  8 gene_19734 3.11e- 9 A7MB89 3.69e- 72 Protein fem-1 homolog C (FEM1c) (FEM1-g…
##  9 gene_17622 5.48e- 9 Q9I8F9 0         Heat shock 70 kDa protein 1 (HSP70-1)   
## 10 gene_13542 1.61e- 8 Q9I8F9 0         Heat shock 70 kDa protein 1 (HSP70-1)   
## 11 gene_28605 1.61e- 8 P53486 2.06e-140 Actin, cytoplasmic 3 (EC 3.6.4.-) (Beta…
## 12 gene_2928  9.78e- 8 A2VEY9 1.02e- 18 Palmitoyltransferase app (EC 2.3.1.225)…
## 13 gene_29217 1.73e- 7 Q02290 1.21e- 74 Endo-1,4-beta-xylanase B (Xylanase B) (…
## 14 gene_25041 2.32e- 7 Q71SY6 5.11e- 15 Out at first protein homolog (Protein V…
## 15 gene_33947 2.88e- 7 P52485 1.06e- 33 Ubiquitin-conjugating enzyme E2-24 kDa …

Top 15 annotated DEGs, SEVERE OA (pH 7.5) SHORT Exposure (orange)

## # A tibble: 15 × 5
##    gene_id         padj spid     evalue protein_names                           
##    <chr>          <dbl> <chr>     <dbl> <chr>                                   
##  1 gene_45978 0.0000930 Q02375 3.68e-12 NADH dehydrogenase [ubiquinone] iron-su…
##  2 gene_17507 0.000342  Q25434 6.64e-24 Adhesive plaque matrix protein (Foot pr…
##  3 gene_4166  0.00623   P31161 9.51e-42 Superoxide dismutase [Mn] 1, mitochondr…
##  4 gene_42525 0.00637   P19835 9.27e-12 Bile salt-activated lipase (BAL) (EC 3.…
##  5 gene_45077 0.00637   P24156 4.59e-12 Prohibitin 1 (Protein l(2)37Cc)         
##  6 gene_38799 0.00637   O00871 2.22e-14 Phosphoglycerate kinase (EC 2.7.2.3)    
##  7 gene_41481 0.00637   Q6PB44 3.91e-18 Tyrosine-protein phosphatase non-recept…
##  8 gene_16724 0.0218    Q96PE7 4.35e-25 Methylmalonyl-CoA epimerase, mitochondr…
##  9 gene_14971 0.0316    Q9VNX8 5.28e-14 Eukaryotic translation initiation facto…
## 10 gene_34598 0.0351    A4Z944 4.98e-21 Zinc finger BED domain-containing prote…
## 11 gene_32125 0.0372    Q96P09 1.82e-13 Baculoviral IAP repeat-containing prote…
## 12 gene_2973  0.0372    Q9VJ38 2.75e-21 Large ribosomal subunit protein uL13m (…
## 13 gene_37254 0.0372    Q1HPL8 3.38e-12 NADH dehydrogenase [ubiquinone] 1 beta …
## 14 gene_20451 0.0409    Q02375 3.68e-12 NADH dehydrogenase [ubiquinone] iron-su…
## 15 gene_40464 0.0409    Q5D891 4.67e-87 Phosphatidylinositol 3-kinase catalytic…

Top 15 annotated DEGs, SEVERE OA (pH 7.5) LONG Exposure (red)

## # A tibble: 15 × 5
##    gene_id          padj spid     evalue protein_names                          
##    <chr>           <dbl> <chr>     <dbl> <chr>                                  
##  1 gene_39883 0.00000883 Q8BTN6 1.51e-25 Leukocyte receptor cluster member 9    
##  2 gene_11011 0.0000211  Q52KI8 1.87e-13 Serine/arginine repetitive matrix prot…
##  3 gene_14632 0.0000220  Q9UKX7 4.87e-24 Nuclear pore complex protein Nup50 (50…
##  4 gene_1130  0.0000256  Q5R685 9.65e-43 Phosphatidylinositol 3-kinase regulato…
##  5 gene_21525 0.0000805  G3X9Z4 2.73e-16 Pre-mRNA cleavage complex 2 protein Pc…
##  6 gene_13021 0.000264   Q5RC80 4.07e-31 RNA-binding protein 39 (RNA-binding mo…
##  7 gene_32373 0.000362   Q9USH8 1.64e-18 Glucosidase 2 subunit beta (Alpha-gluc…
##  8 gene_3253  0.000544   Q0KK59 6.26e-15 Protein unc-79 homolog                 
##  9 gene_46423 0.000833   F4ILR7 8.28e-21 DExH-box ATP-dependent RNA helicase DE…
## 10 gene_13289 0.000997   Q7K3M5 3.45e-65 ATP-dependent RNA helicase DHX15 homol…
## 11 gene_30576 0.00112    Q4R6M5 2.49e-20 Probable ATP-dependent RNA helicase DD…
## 12 gene_29874 0.00123    P35509 4.34e-56 Casein kinase I isoform gamma-3 (CKI-g…
## 13 gene_19035 0.00132    O14730 5.68e-17 Serine/threonine-protein kinase RIO3 (…
## 14 gene_21288 0.00149    Q9NDJ2 2.08e-59 Helicase domino (EC 3.6.4.-)           
## 15 gene_2928  0.00155    A2VEY9 1.02e-18 Palmitoyltransferase app (EC 2.3.1.225…

Summary stats for the number of Differentially Expressed Genes (DEGs)

Bar plot of the number of DEGs identified in each contrast

# write simple function to return the name of an object as a string

# Here is a list containing all DEG sets - this will be used in subsequent code chunks, too! 
degs <- list(
diffex.mod.short.filt,
diffex.mod.long.filt, 
diffex.sev.short.filt, 
diffex.sev.long.filt)

names(degs) <- c("Moderate OA - Short", "Moderate OA - Long", 
                 "Severe OA - Short", "Severe OA - Long")

n.degs <- data.frame(matrix(NA, nrow = length(degs), ncol = 3))
names(n.degs) <- c("contrast", "n.degs", "perc")
for (i in (1:length(degs))) {
  n.degs[i,1] <- names(degs[i])
  n.degs[i,2] <- nrow(degs[[i]])
  n.degs[i,3] <- 100*round(nrow(degs[[i]])/ncol(counts.ts), digits = 3)
}

n.degs$contrast <- factor(n.degs$contrast, levels=n.degs$contrast, ordered = T)

save(degs, file = "../results/deseq2/degs")

Barplot of the number of DEGs

# single stressors vs. multiple stressors 
#pdf(file="../results/deseq2/DEGs_barplot_single-vs-double-stressor.pdf", height = 2, width = 7)
ggplot(n.degs, 
       # %>% 
       #   filter(contrast %in% c("Amb vs. Low pH @6C",
       #                          "3 vs. 6C @Amb pH", "6 amb vs. 3C low",
       #                          "6 vs. 10C @Amb pH", "6 amb vs. 10 low")) %>%
       #   mutate(effect_of=case_when(
       #     grepl("Amb vs. Low pH @6C", contrast) ~ "High pCO2",
       #     grepl("3 vs. 6C", contrast) ~ "Cold temperature",
       #     grepl("6 amb vs. 3C low", contrast) ~ "Cold temperature + High pCO2",
       #     grepl("6 amb vs. 10 low", contrast) ~ "Warm temperature + High pCO2",
       #     grepl("6 vs. 10C", contrast) ~ "Warm temperature")) %>% 
       #  mutate(effect_of=fct_relevel(effect_of,
       #     c("Cold temperature + High pCO2", "Cold temperature",
       #       "Warm temperature + High pCO2", "Warm temperature",
       #       "High pCO2"))), 
       aes(x=contrast, y=n.degs, fill=contrast)) + 
  geom_bar(alpha=0.8, stat = "identity", color="gray20", size=0.1) +
  geom_text(hjust=-0.1, size=3.5, aes(label=paste(comma(n.degs), " (", perc, "%)", sep=""))) + 
  xlab("Contrast") + ylab("Number Differentially Expressed Genes (% of all genes)") + 
  xlab(NULL) + ylab(NULL) + coord_flip() + ggtitle("Number Differentially Expressed Genes (% of all genes)") +
  theme_minimal() + ylim(c(0,2150)) + 
  theme(legend.position = "none", plot.title = element_text(size = 10)) +
  scale_fill_manual(name="Treatment",
                     values=c("Moderate OA - Short"="darkgreen",
                              "Moderate OA - Long"="yellow3",
                              "Severe OA - Short"="sienna2",
                              "Severe OA - Long"="firebrick4"),
                     labels=c("Moderate OA - Short"="pH 7.8, Short Exposure",
                              "Moderate OA - Long"="pH 7.8, Long Exposure",
                              "Severe OA - Short"="pH 7.5, Short Exposure",
                              "Severe OA - Long"="pH 7.5, Long Exposure")) +
  guides(fill=guide_legend(nrow=3,byrow=TRUE,reverse=TRUE)) 

#dev.off()

Regenerate barplot for paper (Figure 3) with the number of upregulated / downregualted DEGs by treatment

# summary(res.mod.short)
# summary(res.mod.long)
# summary(res.sev.short)
# summary(res.sev.long)
degs.updown <- list(
diffex.mod.short.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.mod.short.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.mod.long.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.mod.long.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.sev.short.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.sev.short.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.sev.long.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.sev.long.filt %>% data.frame() %>% filter(log2FoldChange < 0))
names(degs.updown) <- c("Moderate OA - Short", "Moderate OA - Short", 
                        "Moderate OA - Long", "Moderate OA - Long",
                        "Severe OA - Short", "Severe OA - Short",
                        "Severe OA - Long", "Severe OA - Long")

n.degs.updown <- data.frame(matrix(NA, nrow = length(degs.updown), ncol = 3))
names(n.degs.updown) <- c("stressor", "n.degs", "perc")
for (i in (1:length(degs.updown))) {
  n.degs.updown[i,1] <- names(degs.updown[i])
  n.degs.updown[i,2] <- nrow(degs.updown[[i]])
  n.degs.updown[i,3] <- 100*round(nrow(degs.updown[[i]])/ncol(counts.ts), digits = 2)
}

n.degs.updown <- n.degs.updown %>%
  mutate(stressor=factor(stressor, levels=
                           c("Moderate OA - Short", "Moderate OA - Long", 
                 "Severe OA - Short", "Severe OA - Long")),
         response=rep(c("up", "down"), times=4))

# Barplot with no. of DEGs by single stressors vs. multiple stressors, including up/down regulation 
#pdf(file="../results/deseq2/DEGs_barplot_single-vs-double-stressor-up-down.pdf", height = 2, width = 7)
ggplot(n.degs.updown, 
       aes(x=stressor, y=n.degs, fill=stressor)) + 
  geom_bar_pattern(aes(pattern=response), alpha=0.75, stat = "identity", color="gray20", size=0.05,
                   pattern_fill="black", pattern_alpha=0.2, pattern_density=0.01, pattern_spacing=0.025) +
  xlab("Contrast") + ylab("Number Differentially Expressed Genes (% of all genes)") + 
  xlab(NULL) + ylab(NULL) + coord_flip() + ggtitle("Number Differentially Expressed Genes (% of all genes)") +
  theme_pubr() + scale_y_continuous(label=comma, limits = c(0,2000)) +
  theme(legend.position = "none", plot.title = element_text(size = 10)) +
  scale_fill_manual(name="Treatment",
                     values=c("Moderate OA - Short"="darkgreen",
                              "Moderate OA - Long"="yellow3",
                              "Severe OA - Short"="sienna2",
                              "Severe OA - Long"="firebrick4"),
                     labels=c("Moderate OA - Short"="pH 7.8, Short Exposure",
                              "Moderate OA - Long"="pH 7.8, Long Exposure",
                              "Severe OA - Short"="pH 7.5, Short Exposure",
                              "Severe OA - Long"="pH 7.5, Long Exposure")) +
  scale_pattern_manual(values = c("stripe","none")) +
  guides(fill=guide_legend(nrow=3,byrow=TRUE,reverse=TRUE)) 

#dev.off()

Heat maps of DEGs

Single heatmap with all DEGs among any treatment

# Generate counts matrix With DEGs among any treatment 

diffex.counts <- subset(assay(vsd.treatment), rownames(dds.DESeq.treatment) %in% unique(c(rownames(diffex.mod.short.filt),rownames(diffex.mod.long.filt),
         rownames(diffex.sev.short.filt), rownames(diffex.sev.long.filt))))

# Create a dataframe to annotate heatmap with treatments 
dds.df <- sample.info[match(colnames(diffex.counts), 
                               sample.info$sampleID),c("sampleID", "treatment")] %>%
  remove_rownames() %>% column_to_rownames(var = "sampleID")

# Make sure treatment order is correct
all(colnames(diffex.counts) == rownames(dds.df)) #double check that samples are still in same order 
## [1] TRUE
# All DEGs among any pH treatment (within temperatures)
pheatmap(diffex.counts, cluster_rows=TRUE, cluster_cols=FALSE,
         show_rownames=FALSE, na.rm=TRUE, annotation_colors = list(treatment=
             c("ambient"= "navyblue", "moderate_short"="darkgreen",
                       "moderate_long"="yellow3","severe_short"="sienna2",
                      "severe_long"="firebrick4")),
         scale="row", main = "Differentially expressed genes (DEGs) among any pH treatment", 
         annotation_col=dds.df[,"treatment", drop=FALSE], fontsize = 8,
         #cutree_rows = 2,  
         border_color=F)

Venn Diagrams of DEGs by pairwise contrast

DAVID enrichment analysis

It is a little clunky, but the DAVID functional analysis tool is one of the best ways to perform enrichment analysis (that I have found to date). To use, you copy a list of Uniprot IDs (or other gene identifier) to clipboard, then paste it into their tool. You then do the same for all genes in the analysis, providing an accurate background list of genes. Here is code to copy all genes, then upregulated and downregulated DEGs identified by each contrast. For background on what an enrichment analysis is, check out this great video.

GENERATE LIST OF DEGS WITH ABS(L2FC)>0.5 (all DEGs, not up.down)

DEGs.4David <- list(
  
  "moderate_short" = 
    enrich.mod.short %>%
    dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
  
  "moderate_long" = 
    enrich.mod.long %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),

    "severe_short" = 
    enrich.sev.short %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "severe_long" = 
    enrich.sev.long %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector()

  ) 

# Write tab-delimited file with each column containing Uniprot IDs for each set of DEGs 
write_delim(x = plyr::ldply(DEGs.4David, rbind) %>% t() %>% as.data.frame(), delim = "\t", col_names = F, na="", 
            file = "../results/deseq2/DEGs-4David.txt")

Run DAVID analysis in browser!

  • Navigate to DAVID on a browser - https://david.ncifcrf.gov/tools.jsp
  • On the left you’ll see “Upload Gene List”
  • For Step 1, use the “B:Choose from a file” and upload the DEGs-updown-4David.txt file that was created in the last code chunk.
  • For the Step 2 dropdown list, select “UNIPROT_ACCESSION”
  • For Step 3, select “Gene List”
  • Select “Submit List”.
  • You then need to upload a background list of genes. This is important. If you don’t, the analysis will assume that you’re comparing the DEG lists to the human genome (You’ll see “Current Background: Homo sapiens”). Instead, we need to upload the list of all genes that we analyzed. To do this, use the below code chunk to copy Uniprot IDs for all genes assessed and paste into the Upload space on DAVID.
# BACKGROUND - all genes submitted to DESeq2 analysis 
enrich.background %>% write_clip(allow_non_interactive = TRUE)

Read in DAVID Enrichment Analysis results

NOTE: outside of R I created a .tab file with the first column containing file names for enrichment results, and second column containing the contrast + which treatment was upregulated (for grouping in R). Example: the following row refers to the file contains enrichment results for genes upregulated in 10C in the contrast 6C vs. 10C at Low pH

#### Read in enriched biological functions for all contrasts

filenames <- read_delim(file="../results/deseq2/GO_filenames.txt", delim = "\t", col_names = c("filename", "gene_set"))
file_list <- vector(mode = "list", length = nrow(filenames))
names(file_list) <- c(filenames$gene_set)

for (i in 1:nrow(filenames)) {
    file_list[[i]] <- data.frame(read_delim(file=paste("../results/deseq2/", filenames[i,"filename"], sep=""), delim = "\t"))}


david <- bind_rows(file_list, .id = "gene_set") %>% clean_names() %>%  # Bind dataframe for each gene_set together
    #filter(category=="GOTERM_BP_DIRECT") %>%
    filter(p_value<0.05) %>%
    dplyr::rename(percent=x) %>%
    separate(term, into = c("term", "process"), sep = "~") %>%
    separate(gene_set, sep = "_", into = c("OA", "duration"), remove = F) %>%
    mutate(treatment=factor(gene_set, ordered = TRUE, levels=c("moderate_short","moderate_long", "severe_short", "severe_long")),
           OA=as.factor(OA), duration=as.factor(duration)) %>% 
  tidyr::complete(OA, duration, category) # add rows for gene_sets that are missing from dataframe

save(david, file="../results/deseq2/david")

Copy DAVID results (Uniprot Kewords and Gene Ontology terms) to clipboard to paste into Google Spreadsheet

# Uniprot keywords
david %>%
  filter(p_value<0.05) %>%
  filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
  arrange(OA, duration, p_value) %>%
  mutate_at(c("p_value", "fdr"), funs(signif(.,2))) %>%
  dplyr::select(treatment, OA, duration, term, process, count, p_value, fdr, genes) %>%
  write_clip(allow_non_interactive = TRUE)

# Gene Ontology terms
david %>%
  filter(p_value<0.05) %>%
  filter(category=="GOTERM_BP_DIRECT") %>%
  arrange(OA, duration, p_value) %>%
  mutate_at(c("p_value", "fdr"), funs(signif(.,2))) %>%
  dplyr::select(treatment, OA, duration, term, process, count, p_value, fdr, genes) %>%
  write_clip(allow_non_interactive = TRUE)

DAVID enrichment analysis results figure, all DEGs for each treatment (not separated by up/down-regulated processes)

#pdf(file="../results/deseq2/KW-bubble.pdf", height = 3.05, width = 4)

david %>% #View() 
  #filter(count>=3) %>% # use this to filter for processes that have at minimum n genes
  filter(p_value<0.05) %>% #filter enrichment results for p-value

  # Option 1: Use this line for gene ontology biological processes  
  filter(category=="GOTERM_BP_DIRECT") %>%

  # Option 2: Use this line for Uniprot Keywords
  #filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
    
    # Can filter for other options in the "category" column, but might need to adjust the y= in the ggplot(aes). 
    # For example would need to use y=term for KEGG_PATHWAY

  group_by(treatment, genes, count) %>%
  dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
  left_join(david %>% 
              dplyr::select(treatment, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes), 
            by = c("treatment", "genes", "p_value", "count")) %>%  #re-add data
#  ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
#  filter(response=="Upregulated") %>% 

  ungroup() %>% arrange(treatment, p_value) %>%
  mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%

ggplot(aes(y = process, x=treatment, col=treatment)) + 
  geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count,  alpha=p_value, 
 facet_wrap(~category,scales="free", nrow = 2) +
  scale_color_manual(name="Treatment",
                     values=c("moderate_short"=lighten("darkgreen", 0.25),
                              "moderate_long"=lighten("yellow3", 0.25),
                              "severe_short"=lighten("sienna2"),
                              "severe_long"=lighten("firebrick4")),
                     labels=c("moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure"),
                     guide = guide_legend(override.aes = list(size=4))) +
  scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
             guide = guide_legend(override.aes = list(col="gray50"))) +
  scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
  theme_cleveland() + 
  theme(
        legend.position = "right", 
        axis.text.x=element_blank(),
        axis.title.x=element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y=element_text(size=7.25), 
        plot.title = element_text(size=8),
        legend.text=element_text(size=6), 
        legend.title = element_text(size=6),
        strip.background = element_blank(),
        strip.text.x = element_blank()) +
  ggtitle("Enriched Biological Processes")

#dev.off()

Re-run DAVID enrichment analysis on genes that are expressed higher (“upregulated”) and lower (“downregulated”) in each treatment

First, determine which log-fold change (LFC) direction (>0 or <0) refers to higher/lower expression for treatments in each contrast

test <- ((
#  enrich.mod.short %>% 
#  enrich.mod.long %>% 
#  enrich.sev.short %>% 
  enrich.sev.long %>% 
    
#    filter(select(., contains("log2FoldChange"))>.5))$gene_id)
    filter(select(., contains("log2FoldChange"))<(-0.5)))$gene_id)

# Here are treatments 
# "ambient"        "moderate_long"  "moderate_short" "severe_long"    "severe_short"  

a <- c("ambient", "moderate_short")
c <- data.frame(gene=character(), treatment=character(), mean=numeric())
d <- list()

for (i in 1:length(test)) {
b <-  plotCounts(dds.DESeq.treatment, gene=test[i], intgroup=c("treatment"), replaced = TRUE, returnData = TRUE) %>%
  rownames_to_column("sampleID") %>%
  left_join(sample.info[,c("sampleID", "treatment")]) %>%
  filter(treatment %in% a)

c[1:2,1] <- test[i]
c[1:2,2:3] <- summarySE(measurevar = "count", groupvars = "treatment", data = b)[,c("treatment", "count")] %>% 
  mutate(treatment=as.character(treatment))

d[[i]] <- c
}

bind_rows(d, .id = "i") %>% 
  mutate(treat.num=factor(treatment, levels = a, ordered = T),
         gene=as.factor(gene)) %>%
  ggplot(aes(x=treatment, y=log(mean, base = 2), group=gene)) + geom_line(size=.25, color="gray50") +
  theme_minimal() + 
  #scale_x_continuous(breaks=c(1,2), labels=a) +
  theme(axis.title.x = element_blank(), plot.title = element_text(size=10), axis.text.x = element_text(hjust = 0.2))

# RESULTS
# enrich.mod.short - LFC > 0.5 - LOWER IN OA
# enrich.mod.short - LFC < (-0.5) - HIGHER IN OA
# enrich.mod.long - LFC > 0.5 - LOWER IN OA
# enrich.mod.long - LFC < (-0.5) - HIGHER IN OA
# enrich.sev.short - LFC > 0.5 - LOWER IN OA
# enrich.sev.short - LFC < (-0.5) - HIGHER IN OA
# enrich.sev.long - LFC > 0.5 - LOWER IN OA
# enrich.sev.long - LFC < (-0.5) - HIGHER IN OA

Generate .tab input file for DAVID enrichment analysis

This file lists DEGs that are upregulated/downregulated in response to each treatment for DAVID enrichment analysis

# RESULTS

DEGs.updown.4David <- list(
  
  "moderate_short_downregulated" = 
    # enrich.mod.short - LFC > 0.5 - LOWER IN OA
    enrich.mod.short %>%
    filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
    dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
  
  "moderate_short_upregulated" =  
    # enrich.mod.short - LFC < (-0.5) - HIGHER IN OA
    enrich.mod.short %>%
    filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>% 
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(), 

  "moderate_long_downregulated" = 
    # enrich.mod.long - LFC > 0.5 - LOWER IN OA
    enrich.mod.long %>%
    filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "moderate_long_upregulated"  =
    # enrich.mod.long - LFC < (-0.5) - HIGHER IN OA
    enrich.mod.long %>%
    filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>% 
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "severe_short_downregulated" = 
    # enrich.sev.short - LFC > 0.5 - LOWER IN OA
    enrich.sev.short %>%
    filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "severe_short_upregulated" = 
    # enrich.sev.short - LFC < (-0.5) - HIGHER IN OA
    enrich.sev.short %>%
     filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>% 
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "severe_long_downregulated" = 
    # enrich.sev.long - LFC > 0.5 - LOWER IN OA
    enrich.sev.long %>%
    filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector(),
  
  "severe_long_upregulated"  =
    # enrich.sev.long - LFC < (-0.5) - HIGHER IN OA
    enrich.sev.long %>%
    filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>% 
    dplyr::select(spid) %>%  na.omit() %>% unlist() %>% as.vector()

  ) 

# Write tab-delimited file with each column containing Uniprot IDs for each set of DEGs 
write_delim(x = ldply(DEGs.updown.4David, rbind) %>% t() %>% as.data.frame(), delim = "\t", col_names = F, na="", 
            file = "../results/deseq2/DEGs-updown-4David.txt")
  • Navigate to DAVID on a browser - https://david.ncifcrf.gov/tools.jsp
  • On the left you’ll see “Upload Gene List”
  • For Step 1, use the “B:Choose from a file” and upload the DEGs-updown-4David.txt file that was created in the last code chunk.
  • For the Step 2 dropdown list, select “UNIPROT_ACCESSION”
  • For Step 3, select “Gene List”
  • Select “Submit List”.
  • You then need to upload a background list of genes. This is important. If you don’t, the analysis will assume that you’re comparing the DEG lists to the human genome (You’ll see “Current Background: Homo sapiens”). Instead, we need to upload the list of all genes that we analyzed. To do this, use the below code chunk to copy Uniprot IDs for all genes assessed and paste into the Upload space on DAVID.
# Copy the BACKGROUND - all genes submitted to DESeq2 analysis 
enrich.background %>% write_clip(allow_non_interactive = TRUE)
  • Paste into the white box titled “A:Paste a list” in the “Upload” tab of DAVID.
  • In Step2 select “UNIPROT_ACCESSION”
  • For Step 3 select Background
  • Submit List
  • Select the “Functional Annotation Tool” then the “Functional Annotation Chart” on the next page. A table will pop up. On top-right of table right click “Download File”, then “Save Link As…” and save to file. I used the filename “../results/deseq2/GO_filenames_updown.txt”.

Read in enriched biological functions that were up/down-regulated in treatments

filenames_updown <- read_delim(file="../results/deseq2/GO_filenames_updown.txt", delim = "\t", col_names = c("filename", "gene_set"))
file_list_updown <- vector(mode = "list", length = nrow(filenames_updown))
names(file_list_updown) <- c(filenames_updown$gene_set)

for (i in 1:nrow(filenames_updown)) {
    file_list_updown[[i]] <- data.frame(read_delim(file=paste("../results/deseq2/", filenames_updown[i,"filename"], sep=""), delim = "\t"))}

david_updown <- bind_rows(file_list_updown, .id = "gene_set") %>% clean_names() %>%  # Bind dataframe for each gene_set together
    #filter(category=="GOTERM_BP_DIRECT") %>%
    filter(p_value<0.05) %>%
    dplyr::rename(percent=x) %>%
    separate(term, into = c("term", "process"), sep = "~") %>%
    separate(gene_set, sep = "_", into = c("OA", "duration", "response"), remove = F) %>%
    mutate(treatment= gsub("_upregulated|_downregulated", "", gene_set) %>% factor(., ordered = TRUE, levels=c("moderate_short","moderate_long", "severe_short", "severe_long")),
           OA=as.factor(OA), duration=as.factor(duration), response=as.factor(response)) %>% 
  tidyr::complete(OA, duration, response, category) # add rows for gene_sets that are missing from dataframe

save(david_updown, file="../results/deseq2/david_updown")

Plot enriched terms in UPREGULATED genes

# Bubble plot, Upregulated processes
#pdf(file="../results/GO-enrichment/Final-DAVID/Upregulated-GO-bubble.pdf", height = 9, width = 7.5)
david_updown %>% 
#  filter(count>=3) %>% 
  filter(p_value<0.05) %>% filter(count>=3) %>%
  filter(category=="GOTERM_BP_DIRECT") %>% 
#  filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%

  group_by(treatment, response, genes, count) %>%
  dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
  left_join(david_updown %>% 
              dplyr::select(treatment, response, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes), 
            by = c("treatment", "response", "genes", "p_value", "count")) %>%  #re-add data
#  ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
  filter(response=="upregulated") %>% 

  ungroup() %>% arrange(treatment, p_value) %>%
  mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%

ggplot(aes(y = process, x=treatment, col=treatment)) + 
  geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count,  alpha=p_value, 
 facet_wrap(~category,scales="free", nrow = 2) +
  scale_color_manual(name="Treatment",
                     values=c("moderate_short"=lighten("darkgreen", 0.25),
                              "moderate_long"=lighten("yellow3", 0.25),
                              "severe_short"=lighten("sienna2"),
                              "severe_long"=lighten("firebrick4")),
                     labels=c("moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure"),
                     guide = guide_legend(override.aes = list(size=4))) +
  scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
             guide = guide_legend(override.aes = list(col="gray50"))) +
  scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
  theme_cleveland() + 
  theme(
        legend.position = "right", 
        axis.text.x=element_blank(),
        axis.title.x=element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y=element_text(size=7.25), 
        plot.title = element_text(size=8),
        legend.text=element_text(size=6), 
        legend.title = element_text(size=6),
        strip.background = element_blank(),
        strip.text.x = element_blank()) +
  ggtitle("Enriched Biological Processes\nin UPREGULATED genes")

Enriched terms in DOWNREGULATED genes

# Bubble plot, Upregulated processes
#pdf(file="../results/GO-enrichment/Final-DAVID/Upregulated-GO-bubble.pdf", height = 9, width = 7.5)
david_updown %>% 
#  filter(count>=3) %>% 
  filter(p_value<0.05) %>% filter(count>=3) %>%
  filter(category=="GOTERM_BP_DIRECT") %>% 
  #filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%

  group_by(treatment, response, genes, count) %>%
  dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
  left_join(david_updown %>% 
              dplyr::select(treatment, response, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes), 
            by = c("treatment", "response", "genes", "p_value", "count")) %>%  #re-add data
#  ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
  filter(response=="downregulated") %>% 

  ungroup() %>% arrange(treatment, p_value) %>%
  mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%

ggplot(aes(y = process, x=treatment, col=treatment)) + 
  geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count,  alpha=p_value, 
 facet_wrap(~category,scales="free", nrow = 2) +
  scale_color_manual(name="Treatment",
                     values=c("moderate_short"=lighten("darkgreen", 0.25),
                              "moderate_long"=lighten("yellow3", 0.25),
                              "severe_short"=lighten("sienna2"),
                              "severe_long"=lighten("firebrick4")),
                     labels=c("moderate_short"="pH 7.8, Short Exposure",
                              "moderate_long"="pH 7.8, Long Exposure",
                              "severe_short"="pH 7.5, Short Exposure",
                              "severe_long"="pH 7.5, Long Exposure"),
                     guide = guide_legend(override.aes = list(size=4))) +
  scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
             guide = guide_legend(override.aes = list(col="gray50"))) +
  scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
  theme_cleveland() + 
  theme(
        legend.position = "right", 
        axis.text.x=element_blank(),
        axis.title.x=element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y=element_text(size=7.25), 
        plot.title = element_text(size=8),
        legend.text=element_text(size=6), 
        legend.title = element_text(size=6),
        strip.background = element_blank(),
        strip.text.x = element_blank()) +
  ggtitle("Enriched Biological Processes\nin DOWNREGULATED genes")